library(tidyverse)
library(here)
library(janitor)
library(devtools)
# emo is not on CRAN, so it is installed from GitHub. Only do this when the
# package is missing: an unconditional install needs network access on every
# run of the script.
if (!requireNamespace("emo", quietly = TRUE)) {
  devtools::install_github("hadley/emo")
}
library(emo)


Motivation and 📦s


TLDR

Problem: survey data with many Likert-style factor variables whose response scales vary by question 🙄
Pattern: several of the factor variables have the same levels 🧐
Solution: change all those variables at once! 🤩


Inspect the problem (simplified example)

Note:

  • Variables that share the same factor levels have already been given a common name prefix (rye_, hk_)
  • This snippet is a remix of code from Emily’s talk for inspecting numeric variables (thanks Brooke Watson, remix 👸, for inspo)
# Load the pre-survey responses; any messages emitted by read_csv() are
# suppressed.
pre <- suppressMessages(
  read_csv(here("r_ladies_pre_data.csv"))
)

# Tabulate every character column whose name contains "rye_" or "hk_" so the
# raw response categories can be inspected before they are turned into factors.
map(
  select(
    select_if(pre, is.character),
    contains("rye_"),
    contains("hk_")
  ),
  ~ tabyl(.)
)
## $rye_readscitext
##            .  n percent valid_percent
##     Emerging 11    0.11    0.12643678
##  Experienced 62    0.62    0.71264368
##       Expert 12    0.12    0.13793103
##     Very new  2    0.02    0.02298851
##         <NA> 13    0.13            NA
## 
## $rye_learnscivocab
##            .  n percent valid_percent
##     Emerging 12    0.12    0.13793103
##  Experienced 62    0.62    0.71264368
##       Expert 11    0.11    0.12643678
##     Very new  2    0.02    0.02298851
##         <NA> 13    0.13            NA
## 
## $rye_scitext
##            .  n percent valid_percent
##     Emerging 26    0.26    0.29885057
##  Experienced 49    0.49    0.56321839
##       Expert  7    0.07    0.08045977
##     Very new  5    0.05    0.05747126
##         <NA> 13    0.13            NA
## 
## $hk_ellparticipateinsci
##                         .  n percent valid_percent
##             Knowledgeable 60    0.60    0.68965517
##  Not at all knowledgeable  2    0.02    0.02298851
##         Not knowledgeable  8    0.08    0.09195402
##        Very knowledgeable 17    0.17    0.19540230
##                      <NA> 13    0.13            NA
## 
## $hk_swdparticipateinsci
##                         .  n percent valid_percent
##             Knowledgeable 59    0.59    0.67816092
##  Not at all knowledgeable  3    0.03    0.03448276
##         Not knowledgeable 11    0.11    0.12643678
##        Very knowledgeable 14    0.14    0.16091954
##                      <NA> 13    0.13            NA
## 
## $hk_teaching_science
##                       .  n percent valid_percent
##           Knowledgeable 57    0.57     0.6551724
##  Not very knowledgeable 12    0.12     0.1379310
##      Very knowledgeable 18    0.18     0.2068966
##                    <NA> 13    0.13            NA
## 
## $hk_teaching_sci_field_trial_units
##                   .  n percent valid_percent
##       Knowledgeable 58    0.58     0.6666667
##   Not knowledgeable 20    0.20     0.2298851
##  Very knowledgeable  9    0.09     0.1034483
##                <NA> 13    0.13            NA


Set correct levels

# Response categories for the "rye_" questions, listed in the order the factor
# levels should take.
rye_levels <- c(
  "Very new",
  "Emerging",
  "Experienced",
  "Expert"
)

# Response categories for the "hk_" questions, listed in the order the factor
# levels should take.
hk_levels <- c(
  "Not at all knowledgeable",
  "Not knowledgeable",
  "Knowledgeable",
  "Very knowledgeable"
)


Apply levels

# Convert the "rye_" and "hk_" columns of `pre` to factors with explicitly
# ordered levels.
#
# factor() silently turns any value that is not listed in `levels` into NA.
# One hk_ question (hk_teaching_science) was worded with "Not very
# knowledgeable" instead of "Not knowledgeable", so applying hk_levels directly
# dropped those 12 responses (NA count 25 instead of 13 in the rendered output
# shown later in this document, which predates this recode). The variant is
# therefore folded into "Not knowledgeable" first, the same way the post-survey
# code maps "not very well" onto level 2.
# NOTE(review): confirm that "Not very knowledgeable" is meant to be the same
# scale point as "Not knowledgeable".
pre_f <-
  pre %>%
  mutate_at(vars(contains("rye_")), ~ factor(., levels = rye_levels)) %>%
  mutate_at(
    vars(contains("hk_")),
    ~ factor(
      ifelse(. == "Not very knowledgeable", "Not knowledgeable", .),
      levels = hk_levels
    )
  )


Check levels

# Re-tabulate the "rye_" and "hk_" columns, now restricted to factor columns,
# to confirm the categories appear in the declared level order.
map(
  select(
    select_if(pre_f, is.factor),
    contains("rye_"),
    contains("hk_")
  ),
  ~ tabyl(.)
)
## $rye_readscitext
##            .  n percent valid_percent
##     Very new  2    0.02    0.02298851
##     Emerging 11    0.11    0.12643678
##  Experienced 62    0.62    0.71264368
##       Expert 12    0.12    0.13793103
##         <NA> 13    0.13            NA
## 
## $rye_learnscivocab
##            .  n percent valid_percent
##     Very new  2    0.02    0.02298851
##     Emerging 12    0.12    0.13793103
##  Experienced 62    0.62    0.71264368
##       Expert 11    0.11    0.12643678
##         <NA> 13    0.13            NA
## 
## $rye_scitext
##            .  n percent valid_percent
##     Very new  5    0.05    0.05747126
##     Emerging 26    0.26    0.29885057
##  Experienced 49    0.49    0.56321839
##       Expert  7    0.07    0.08045977
##         <NA> 13    0.13            NA
## 
## $hk_ellparticipateinsci
##                         .  n percent valid_percent
##  Not at all knowledgeable  2    0.02    0.02298851
##         Not knowledgeable  8    0.08    0.09195402
##             Knowledgeable 60    0.60    0.68965517
##        Very knowledgeable 17    0.17    0.19540230
##                      <NA> 13    0.13            NA
## 
## $hk_swdparticipateinsci
##                         .  n percent valid_percent
##  Not at all knowledgeable  3    0.03    0.03448276
##         Not knowledgeable 11    0.11    0.12643678
##             Knowledgeable 59    0.59    0.67816092
##        Very knowledgeable 14    0.14    0.16091954
##                      <NA> 13    0.13            NA
## 
## $hk_teaching_science
##                         .  n percent valid_percent
##  Not at all knowledgeable  0    0.00          0.00
##         Not knowledgeable  0    0.00          0.00
##             Knowledgeable 57    0.57          0.76
##        Very knowledgeable 18    0.18          0.24
##                      <NA> 25    0.25            NA
## 
## $hk_teaching_sci_field_trial_units
##                         .  n percent valid_percent
##  Not at all knowledgeable  0    0.00     0.0000000
##         Not knowledgeable 20    0.20     0.2298851
##             Knowledgeable 58    0.58     0.6666667
##        Very knowledgeable  9    0.09     0.1034483
##                      <NA> 13    0.13            NA


🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 BUT WAIT…

Post Survey

Here, things get interesting! The dear people who wrote this survey decided to phrase the levels of the same type of question in different ways.

Inspect the problem

# Load the post-survey responses; any messages emitted by read_csv() are
# suppressed.
post <- suppressMessages(
  read_csv(here("post_data.csv"))
)

# Tabulate each column whose name contains "well_" to see how the response
# wording differs between questions.
map(
  select(post, contains("well_")),
  ~ tabyl(.)
)
## $well_meet_ell
##                .  n    percent valid_percent
##    Not very well 20 0.10582011    0.12121212
##  Not well at all  8 0.04232804    0.04848485
##        Very Well 47 0.24867725    0.28484848
##             Well 90 0.47619048    0.54545455
##             <NA> 24 0.12698413            NA
## 
## $well_meet_sped
##                .   n    percent valid_percent
##    Not very well  30 0.15873016    0.17543860
##  Not well at all   7 0.03703704    0.04093567
##        Very Well  30 0.15873016    0.17543860
##             Well 104 0.55026455    0.60818713
##             <NA>  18 0.09523810            NA
## 
## $well_meet_low
##                .   n    percent valid_percent
##    Not very well  28 0.14814815    0.15555556
##  Not well at all   3 0.01587302    0.01666667
##        Very Well  34 0.17989418    0.18888889
##             Well 115 0.60846561    0.63888889
##             <NA>   9 0.04761905            NA
## 
## $well_meet_high
##                .   n     percent valid_percent
##    Not very well   5 0.026455026   0.027472527
##  Not well at all   1 0.005291005   0.005494505
##        Very Well 135 0.714285714   0.741758242
##             Well  41 0.216931217   0.225274725
##             <NA>   7 0.037037037            NA
## 
## $well_teacher_guide
##                .  n    percent valid_percent
##        Just okay 26 0.13756614    0.14364641
##  Not at all well  6 0.03174603    0.03314917
##        Very well 70 0.37037037    0.38674033
##             Well 79 0.41798942    0.43646409
##             <NA>  8 0.04232804            NA
## 
## $well_s_reaction
##                .  n    percent valid_percent
##        Just okay 15 0.07936508    0.08333333
##  Not at all well  3 0.01587302    0.01666667
##        Very well 97 0.51322751    0.53888889
##             Well 65 0.34391534    0.36111111
##             <NA>  9 0.04761905            NA
## 
## $well_design_proposal
##                .   n    percent valid_percent
##      Not so well   4 0.02116402     0.1538462
##  Not well at all   4 0.02116402     0.1538462
##        Very well   6 0.03174603     0.2307692
##             Well  12 0.06349206     0.4615385
##             <NA> 163 0.86243386            NA

Make all factor var responses lower case

# Lower-case every "well_" response so that capitalisation differences such as
# "Very Well" vs "Very well" collapse into a single value.
post_f <- mutate_at(post, vars(contains("well_")), tolower)

Find the patterns, make patterns and replacements,

then gsub them all away!

# Regular expressions matched against the lower-cased responses.
# pattern_2 ("not 4") only matches text produced by an earlier replacement of
# pattern_4, e.g. "not very well" -> "not 4".
pattern_4 <- "very.*"
pattern_1 <- ".*at all.*"
pattern_2 <- "not 4"
pattern_3 <- "somewhat.*"
pattern_3w <- "well"

# Numeric code substituted for the pattern with the matching suffix.
replacement_4 <- 4
replacement_1 <- 1
replacement_2 <- 2
replacement_3 <- 3
replacement_3w <- 3

# Recode the lower-cased responses to the digits "1"-"4" (as character values,
# because gsub() returns character).
post_f <-
  post_f %>%
  mutate_at(
    # Every column except those whose name contains "_comment" is rewritten.
    vars(-contains("_comment")),
    function(response) {
      # Order matters: "very.*" runs first so that "not very well" becomes
      # "not 4", which pattern_2 then turns into 2; the bare "well" pattern
      # runs last so it only touches what the earlier patterns left behind.
      response <- gsub(pattern_4, replacement_4, response)
      response <- gsub(pattern_1, replacement_1, response)
      response <- gsub(pattern_2, replacement_2, response)
      response <- gsub(pattern_3, replacement_3, response)
      gsub(pattern_3w, replacement_3w, response)
    }
  ) %>%
  # Wordings the patterns do not cover: "just okay" is coded as 2 in the two
  # columns handled here.
  mutate_at(
    vars(well_teacher_guide, well_s_reaction),
    ~ ifelse(. == "just okay", "2", .)
  ) %>%
  # "not so well" has become "not so 3" after the "well" replacement above;
  # it is coded as 2 as well.
  mutate(
    well_design_proposal =
      ifelse(well_design_proposal == "not so 3", "2", well_design_proposal)
  )

Check levels

# Tabulate the recoded "well_" columns to confirm every response is now one of
# the codes 1-4 (or NA).
map(
  select(post_f, contains("well_")),
  ~ tabyl(.)
)
## $well_meet_ell
##     .  n    percent valid_percent
##     1  8 0.04232804    0.04848485
##     2 20 0.10582011    0.12121212
##     3 90 0.47619048    0.54545455
##     4 47 0.24867725    0.28484848
##  <NA> 24 0.12698413            NA
## 
## $well_meet_sped
##     .   n    percent valid_percent
##     1   7 0.03703704    0.04093567
##     2  30 0.15873016    0.17543860
##     3 104 0.55026455    0.60818713
##     4  30 0.15873016    0.17543860
##  <NA>  18 0.09523810            NA
## 
## $well_meet_low
##     .   n    percent valid_percent
##     1   3 0.01587302    0.01666667
##     2  28 0.14814815    0.15555556
##     3 115 0.60846561    0.63888889
##     4  34 0.17989418    0.18888889
##  <NA>   9 0.04761905            NA
## 
## $well_meet_high
##     .   n     percent valid_percent
##     1   1 0.005291005   0.005494505
##     2   5 0.026455026   0.027472527
##     3  41 0.216931217   0.225274725
##     4 135 0.714285714   0.741758242
##  <NA>   7 0.037037037            NA
## 
## $well_teacher_guide
##     .  n    percent valid_percent
##     1  6 0.03174603    0.03314917
##     2 26 0.13756614    0.14364641
##     3 79 0.41798942    0.43646409
##     4 70 0.37037037    0.38674033
##  <NA>  8 0.04232804            NA
## 
## $well_s_reaction
##     .  n    percent valid_percent
##     1  3 0.01587302    0.01666667
##     2 15 0.07936508    0.08333333
##     3 65 0.34391534    0.36111111
##     4 97 0.51322751    0.53888889
##  <NA>  9 0.04761905            NA
## 
## $well_design_proposal
##     .   n    percent valid_percent
##     1   4 0.02116402     0.1538462
##     2   4 0.02116402     0.1538462
##     3  12 0.06349206     0.4615385
##     4   6 0.03174603     0.2307692
##  <NA> 163 0.86243386            NA

Final product example



🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉 🎉


Epilogue

Thank you to R Ladies all around the world!!